In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from warnings import filterwarnings 
filterwarnings("ignore")

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
In [2]:
data = pd.read_csv("C:\\Users\\laxma\\Downloads\\diabetes.csv")
data
Out[2]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
0 6 148 72 35 0 33.6 0.627 50 1
1 1 85 66 29 0 26.6 0.351 31 0
2 8 183 64 0 0 23.3 0.672 32 1
3 1 89 66 23 94 28.1 0.167 21 0
4 0 137 40 35 168 43.1 2.288 33 1
... ... ... ... ... ... ... ... ... ...
763 10 101 76 48 180 32.9 0.171 63 0
764 2 122 70 27 0 36.8 0.340 27 0
765 5 121 72 23 112 26.2 0.245 30 0
766 1 126 60 0 0 30.1 0.349 47 1
767 1 93 70 31 0 30.4 0.315 23 0

768 rows × 9 columns

In [3]:
data.head()
Out[3]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
0 6 148 72 35 0 33.6 0.627 50 1
1 1 85 66 29 0 26.6 0.351 31 0
2 8 183 64 0 0 23.3 0.672 32 1
3 1 89 66 23 94 28.1 0.167 21 0
4 0 137 40 35 168 43.1 2.288 33 1
In [4]:
data.tail()
Out[4]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
763 10 101 76 48 180 32.9 0.171 63 0
764 2 122 70 27 0 36.8 0.340 27 0
765 5 121 72 23 112 26.2 0.245 30 0
766 1 126 60 0 0 30.1 0.349 47 1
767 1 93 70 31 0 30.4 0.315 23 0
In [5]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
In [6]:
data.isnull().sum()
Out[6]:
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64
In [7]:
data.duplicated().sum()
Out[7]:
0
In [8]:
data.Insulin.sum()
Out[8]:
61286
In [9]:
data.Insulin
Out[9]:
0        0
1        0
2        0
3       94
4      168
      ... 
763    180
764      0
765    112
766      0
767      0
Name: Insulin, Length: 768, dtype: int64
In [10]:
data.columns
Out[10]:
Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')
In [11]:
#VISUALIZATION
In [12]:
plt.bar(data['Outcome'],data['Age'])
plt.xticks(rotation=90)
plt.show()
In [13]:
fig=px.bar(data,x='Pregnancies',y='Outcome',color='Pregnancies')
fig.show()
In [14]:
fig=px.violin(data,x='SkinThickness',y='BMI',color='SkinThickness')
fig.show()
In [15]:
plt.figure(figsize=(10,4))
sns.countplot(x='Outcome', data=data, color='b')
plt.xticks(rotation=90)
plt.show()
In [16]:
plt.figure(figsize=(10,4))
top_car = data['DiabetesPedigreeFunction'].value_counts().nlargest(10)
sns.countplot(y=data.DiabetesPedigreeFunction, order=top_car.index, color='red')
Out[16]:
<AxesSubplot:xlabel='count', ylabel='DiabetesPedigreeFunction'>
In [17]:
sns.lineplot(x='BloodPressure', y='Age', data=data).set_title('Variation of Glucose with BMI')
Out[17]:
Text(0.5, 1.0, 'Variation of Glucose with BMI')
In [18]:
sns.barplot(data['Outcome'],data['SkinThickness'],color='r')
plt.xticks(rotation=90)
plt.show()
In [19]:
plt.figure(figsize=(8, 4))
sns.scatterplot(data=data, x='BMI', y='Age')
plt.title('BMI and there Age')
plt.xlabel('BMI')
plt.ylabel('Age')
plt.show()
In [20]:
sns.displot(data["Age"])
Out[20]:
<seaborn.axisgrid.FacetGrid at 0x24d89f93910>
In [21]:
sns.boxplot(x='BloodPressure',y='Glucose',data=data)
plt.xticks(rotation=90)
Out[21]:
(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46]),
 [Text(0, 0, '0'),
  Text(1, 0, '24'),
  Text(2, 0, '30'),
  Text(3, 0, '38'),
  Text(4, 0, '40'),
  Text(5, 0, '44'),
  Text(6, 0, '46'),
  Text(7, 0, '48'),
  Text(8, 0, '50'),
  Text(9, 0, '52'),
  Text(10, 0, '54'),
  Text(11, 0, '55'),
  Text(12, 0, '56'),
  Text(13, 0, '58'),
  Text(14, 0, '60'),
  Text(15, 0, '61'),
  Text(16, 0, '62'),
  Text(17, 0, '64'),
  Text(18, 0, '65'),
  Text(19, 0, '66'),
  Text(20, 0, '68'),
  Text(21, 0, '70'),
  Text(22, 0, '72'),
  Text(23, 0, '74'),
  Text(24, 0, '75'),
  Text(25, 0, '76'),
  Text(26, 0, '78'),
  Text(27, 0, '80'),
  Text(28, 0, '82'),
  Text(29, 0, '84'),
  Text(30, 0, '85'),
  Text(31, 0, '86'),
  Text(32, 0, '88'),
  Text(33, 0, '90'),
  Text(34, 0, '92'),
  Text(35, 0, '94'),
  Text(36, 0, '95'),
  Text(37, 0, '96'),
  Text(38, 0, '98'),
  Text(39, 0, '100'),
  Text(40, 0, '102'),
  Text(41, 0, '104'),
  Text(42, 0, '106'),
  Text(43, 0, '108'),
  Text(44, 0, '110'),
  Text(45, 0, '114'),
  Text(46, 0, '122')])
In [22]:
sns.violinplot(x='Pregnancies',y='Age',data=data)
Out[22]:
<AxesSubplot:xlabel='Pregnancies', ylabel='Age'>
In [23]:
#MODEL BUILDING
In [24]:
X = data.iloc[:,0:8]
y = data.iloc[:,8]

Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.2, random_state=4)
In [25]:
sc = StandardScaler()

Xtr = sc.fit_transform(Xtr)
Xte = sc.fit_transform(Xte)
In [26]:
clf=KNeighborsClassifier(n_neighbors=11, p=2, metric='euclidean')
In [27]:
clf.fit(Xtr,ytr)
pred = clf.predict(Xte)
In [28]:
print(confusion_matrix(pred, yte))
[[88 25]
 [14 27]]
In [29]:
print(accuracy_score(pred, yte))
0.7467532467532467
In [ ]: